library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.1.6     ✔ dplyr   1.0.8
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Tidyverse is a dialect of R that consists of a range of R packages developed by Hadley Wickham’s team (and others). The main ones:

Quick note about piping

Piping comes with tidyverse in magrittr library. Function calls can be constructed like

numbers <- seq(1, 5)
mean(sqrt(numbers))
## [1] 1.676466

With the %>% character, you can express these as a flow of data through functions instead:

numbers %>% sqrt() %>% mean()
## [1] 1.676466

Onto tidyverse

Here I’ve given you some data. What we want to do is see how the performance per flowcell is doing.

Reading data

fastqc_data <- read_csv("data/fastqc_data.csv")
## Rows: 131672 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): filename
## dbl (7): Base, Mean, Median, Lower.Quartile, Upper.Quartile, X10th.Percentil...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

We can quickly inspect this to take a look at the layout

summary(fastqc_data)
##    filename              Base          Mean           Median   Lower.Quartile
##  Length:131672      Min.   :  1   Min.   :33.44   Min.   :37   Min.   :37    
##  Class :character   1st Qu.: 38   1st Qu.:35.33   1st Qu.:37   1st Qu.:37    
##  Mode  :character   Median : 76   Median :35.73   Median :37   Median :37    
##                     Mean   : 76   Mean   :35.65   Mean   :37   Mean   :37    
##                     3rd Qu.:114   3rd Qu.:36.02   3rd Qu.:37   3rd Qu.:37    
##                     Max.   :151   Max.   :36.65   Max.   :37   Max.   :37    
##  Upper.Quartile X10th.Percentile X90th.Percentile
##  Min.   :37     Min.   :25.00    Min.   :37      
##  1st Qu.:37     1st Qu.:37.00    1st Qu.:37      
##  Median :37     Median :37.00    Median :37      
##  Mean   :37     Mean   :34.67    Mean   :37      
##  3rd Qu.:37     3rd Qu.:37.00    3rd Qu.:37      
##  Max.   :37     Max.   :37.00    Max.   :37
glimpse(fastqc_data)
## Rows: 131,672
## Columns: 8
## $ filename         <chr> "NA12878--TWE-0-EGG4_S32_L001_R1_001.stats-fastqc.txt…
## $ Base             <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ Mean             <dbl> 35.89361, 35.74607, 35.96512, 36.09220, 36.13285, 36.…
## $ Median           <dbl> 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 3…
## $ Lower.Quartile   <dbl> 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 3…
## $ Upper.Quartile   <dbl> 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 3…
## $ X10th.Percentile <dbl> 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 3…
## $ X90th.Percentile <dbl> 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 3…
head(fastqc_data)

We also have some metadata, which we’ll read in:

metadata <- read_csv("data/metadata.csv")
## Rows: 878 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): project_id, project_name, file_id, file_name
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(metadata)
##   project_id        project_name         file_id           file_name        
##  Length:878         Length:878         Length:878         Length:878        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character
glimpse(metadata)
## Rows: 878
## Columns: 4
## $ project_id   <chr> "project-G3VKyBj46XbQ5GQV1304VYVK", "project-G3VKyBj46XbQ…
## $ project_name <chr> "002_210702_A01303_0011_BHFYVFDRXY_TWE", "002_210702_A013…
## $ file_id      <chr> "file-G3Xk8f04K8ppgZbVFk9KF7xQ", "file-G3Xk8J04K8pQ54QV6Q…
## $ file_name    <chr> "X218560-GM2106048-TWE-N-EGG4_S7_L002_R2_001.stats-fastqc…
head(metadata)

file name is the common element between them.

Since we need the project name, we can do some work:

dataset <- left_join(fastqc_data, metadata, by = c("filename" = "file_name")) %>% unique()

dataset

There’s some cleaning to do.

1: extract extra features 2: address the basecalling thing 3: resolve duplicates

First, let’s see what features we can extract from the filename. Looks like:

From the project, we can also get:

We’ll start with the filename features. separate is our friend here - it creates new variables.

NA12878–TWE-0-EGG4_S32_L001_R1_001

data_2 <- dataset %>%
   separate(filename, sep = "_", into = c("sample_name", "sample_number", "lane", "read", "something_else"))
data_2

We’ll do something similar with the project. separate will be a bit more difficult here because the flowcell barcode is split with the same delimiter as the rest. This time, we’ll create new columns with extract, and regexes

002_210913_A01295_0024_AHL72CDRXY_TWE

data_3 <- data_2 %>%
  extract(col = project_name, into = c("project_type", "run_name", "assay"), regex = "(\\d{3})_(.*)_(\\w+)$")

data_3

Another problem is that the Base field starts at 1 for read 2. If we want to visualise the cycles as a continuous sequence from 1-302, we’ll want to add an offset to read 2 only. Here, mutate is our friend. We’ll make a new column called cycle for this purpose.

data_4 <- data_3 %>%
  mutate(cycle = ifelse(read == "R1", Base, Base + 151))

data_4

It’s starting to get a bit difficult to look at, so let’s rearrange the columns using select.

data_5 <- data_4 %>%
  select(starts_with("sample"), starts_with("project"), assay, run_name, lane, read, Base, cycle, everything())

data_5

We can also use select to retain or drop columns. Some other things that you might want to do is create factors from categorical variables to help with ordering. But we won’t do this for now.

Something you may have noticed is that we appear to have gained rows since joining the datasets together, which is weird.

nrow(data_5) - nrow(fastqc_data)
## [1] 906

How are we going to find these?

Let’s use grouped filtering to find the odd ones out.

Here’s a quick filter to show you how it works:

data_5 %>%
  filter(sample_name == "NA12878--TWE-0-EGG4")

We can use group_by, which groups your data by any variables you have.

data_5 %>%
  group_by(sample_name, sample_number, lane, read, Base, something_else, Mean, Median, Lower.Quartile) %>%
  mutate(count = n()) %>%
  filter(count >= 2) %>%
  select(sample_name, Base, Mean, file_id, everything())

The data is unique except for the file ID, probably due to multiple invocations of FASTQC on the same data. We can try removing the file ID, unique-ifying the result, and comparing with the original again.

data_6 <- data_5 %>%
  select(-file_id) %>%
  unique()

nrow(data_6) - nrow(fastqc_data)
## [1] 0

It worked :)

analysis

We can start doing some analysis. Couple of ideas:

We’ll use ggplot for the first.

ggplot stands for the grammar of graphics: i.e., you build up visualisations by specifying things in order:

theme_set(theme_bw())

ggplot(data = data_6, aes(x = cycle, y = Mean, group = paste0(sample_name, run_name, lane))) +
  geom_line(aes(colour = run_name), alpha = 0.7)

It’s ok. But we can do better.

cycleplot <- ggplot(data = data_6, aes(x = cycle, y = Mean, group = paste0(run_name, lane))) +
  stat_summary(geom = "errorbar", fun = mean, fun.min = min, fun.max = max, aes(colour = run_name), alpha = 0.15) +
  stat_summary(geom = "line", fun = mean, aes(colour = run_name))

cycleplot

cycleplot +
  facet_wrap(~ lane)

ggplot(data = data_5, aes(x = cycle, y = Mean, group = sample_name)) +
  stat_identity(geom = "line", aes(colour = run_name), alpha = 0.7)

ggplot(data = data_5, aes(x = cycle, y = Mean, group = run_name)) +
  stat_summary(geom = "errorbar", fun = mean, fun.min = min, fun.max = max, aes(colour = run_name), alpha = 0.3) +
  stat_summary(geom = "line", fun = mean, aes(colour = run_name)) +
  facet_wrap(~ lane)

Summarising last 10 cycles

data_5 %>%
  filter(cycle >= 292) %>%
  group_by(sample_name, run_name, lane) %>%
  summarise(m = mean(Mean)) %>%
  ungroup() %>%
  ggplot(aes(x = m)) +
    geom_density(aes(colour = run_name, fill = run_name, group = str_c(run_name, lane)), alpha = 0.5) +
    facet_grid(lane ~ .)
## `summarise()` has grouped output by 'sample_name', 'run_name'. You can override
## using the `.groups` argument.